# Load dataset and import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly .express as px
# Read the penguins CSV into a DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path, so the
# notebook only runs where that exact file exists — consider a relative or
# configurable path.
df = pd.read_csv(r"C:\Users\jayant soni\Downloads\penguins_lter (1).csv")
# Preview the first rows of the loaded data.
df.head()
| studyName | Sample Number | Species | Region | Island | Stage | Individual ID | Clutch Completion | Date Egg | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | Comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PAL0708 | 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A1 | Yes | 11/11/07 | 39.1 | 18.7 | 181.0 | 3750.0 | MALE | NaN | NaN | Not enough blood for isotopes. |
| 1 | PAL0708 | 2 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A2 | Yes | 11/11/07 | 39.5 | 17.4 | 186.0 | 3800.0 | FEMALE | 8.94956 | -24.69454 | NaN |
| 2 | PAL0708 | 3 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A1 | Yes | 11/16/07 | 40.3 | 18.0 | 195.0 | 3250.0 | FEMALE | 8.36821 | -25.33302 | NaN |
| 3 | PAL0708 | 4 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A2 | Yes | 11/16/07 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Adult not sampled. |
| 4 | PAL0708 | 5 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N3A1 | Yes | 11/16/07 | 36.7 | 19.3 | 193.0 | 3450.0 | FEMALE | 8.76651 | -25.32426 | NaN |
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns — a first look at the value ranges to help spot possible outliers.
df.describe()
| Sample Number | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|
| count | 344.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 330.000000 | 331.000000 |
| mean | 63.151163 | 43.921930 | 17.151170 | 200.915205 | 4201.754386 | 8.733382 | -25.686292 |
| std | 40.430199 | 5.459584 | 1.974793 | 14.061714 | 801.954536 | 0.551770 | 0.793961 |
| min | 1.000000 | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | 7.632200 | -27.018540 |
| 25% | 29.000000 | 39.225000 | 15.600000 | 190.000000 | 3550.000000 | 8.299890 | -26.320305 |
| 50% | 58.000000 | 44.450000 | 17.300000 | 197.000000 | 4050.000000 | 8.652405 | -25.833520 |
| 75% | 95.250000 | 48.500000 | 18.700000 | 213.000000 | 4750.000000 | 9.172123 | -25.062050 |
| max | 152.000000 | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | 10.025440 | -23.787670 |
# Missing-value inspection: count the missing entries in each column.
# (Nothing is imputed here; this only reports how much is missing.)
df.isnull().sum()
studyName 0 Sample Number 0 Species 0 Region 0 Island 0 Stage 0 Individual ID 0 Clutch Completion 0 Date Egg 0 Culmen Length (mm) 2 Culmen Depth (mm) 2 Flipper Length (mm) 2 Body Mass (g) 2 Sex 10 Delta 15 N (o/oo) 14 Delta 13 C (o/oo) 13 Comments 318 dtype: int64
# Count the non-missing entries in each column.
df.notnull().sum()
studyName 344 Sample Number 344 Species 344 Region 344 Island 344 Stage 344 Individual ID 344 Clutch Completion 344 Date Egg 344 Culmen Length (mm) 342 Culmen Depth (mm) 342 Flipper Length (mm) 342 Body Mass (g) 342 Sex 334 Delta 15 N (o/oo) 330 Delta 13 C (o/oo) 331 Comments 26 dtype: int64
# Percentage of missing entries in each column.
df.isna().mean()*100
studyName 0.000000 Sample Number 0.000000 Species 0.000000 Region 0.000000 Island 0.000000 Stage 0.000000 Individual ID 0.000000 Clutch Completion 0.000000 Date Egg 0.000000 Culmen Length (mm) 0.581395 Culmen Depth (mm) 0.581395 Flipper Length (mm) 0.581395 Body Mass (g) 0.581395 Sex 2.906977 Delta 15 N (o/oo) 4.069767 Delta 13 C (o/oo) 3.779070 Comments 92.441860 dtype: float64
# Histogram of culmen depth.
fig = px.histogram(df, x="Culmen Depth (mm)")
fig.show()

# Box plot of flipper length.
fig = px.box(df, y="Flipper Length (mm)")
fig.show()
def find_outliers_IQR(df, factor=1.2):
    """Return the entries of *df* that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of *df* and
    ``IQR = Q3 - Q1``. Values strictly below the lower fence or strictly
    above the upper fence are returned.

    Args:
        df: The data to screen. Intended for a numeric pandas Series (that
            is how this file calls it); the result keeps the original index
            labels of the flagged values.
        factor: Multiplier applied to the IQR to place the fences. The
            default of 1.2 keeps this function's existing results; Tukey's
            conventional rule uses 1.5.

    Returns:
        The subset of *df* lying outside the fences (empty if there is none).
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    # Comparisons against NaN are False, so missing values are never flagged.
    return df[(df < lower_fence) | (df > upper_fence)]
# Apply the IQR rule to culmen depth and report what was flagged.
outliers = find_outliers_IQR(df["Culmen Depth (mm)"])
print(f"number of outliers: {len(outliers)}")
print(f"max outlier value: {outliers.max()}")
print(f"min outlier value: {outliers.min()}")
outliers
number of outliers: 0 max outlier value: nan min outlier value: nan
Series([], Name: Culmen Depth (mm), dtype: float64)
# Pearson correlation among the numeric columns.
# The frame also holds text columns (Species, Island, Sex, ...); on
# pandas >= 2.0 calling corr() on them raises, so restrict to numeric
# columns explicitly. select_dtypes keeps this working on older pandas too.
df.select_dtypes(include="number").corr(method="pearson")
| Sample Number | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|
| Sample Number | 1.000000 | -0.236356 | -0.022352 | 0.040849 | -0.007042 | 0.006952 | -0.488690 |
| Culmen Length (mm) | -0.236356 | 1.000000 | -0.235053 | 0.656181 | 0.595110 | -0.059759 | 0.189025 |
| Culmen Depth (mm) | -0.022352 | -0.235053 | 1.000000 | -0.583851 | -0.471916 | 0.605874 | 0.429933 |
| Flipper Length (mm) | 0.040849 | 0.656181 | -0.583851 | 1.000000 | 0.871202 | -0.507787 | -0.376223 |
| Body Mass (g) | -0.007042 | 0.595110 | -0.471916 | 0.871202 | 1.000000 | -0.537888 | -0.374638 |
| Delta 15 N (o/oo) | 0.006952 | -0.059759 | 0.605874 | -0.507787 | -0.537888 | 1.000000 | 0.570615 |
| Delta 13 C (o/oo) | -0.488690 | 0.189025 | 0.429933 | -0.376223 | -0.374638 | 0.570615 | 1.000000 |
# Kendall rank correlation among the numeric columns.
# Text columns are excluded explicitly because corr() raises on them with
# pandas >= 2.0; select_dtypes keeps this working on older pandas too.
df.select_dtypes(include="number").corr(method="kendall")
| Sample Number | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|
| Sample Number | 1.000000 | -0.141843 | -0.029314 | 0.040947 | 0.009671 | 0.009710 | -0.254544 |
| Culmen Length (mm) | -0.141843 | 1.000000 | -0.122850 | 0.483345 | 0.433359 | -0.064553 | 0.097830 |
| Culmen Depth (mm) | -0.029314 | -0.122850 | 1.000000 | -0.281894 | -0.195070 | 0.424881 | 0.293145 |
| Flipper Length (mm) | 0.040947 | 0.483345 | -0.281894 | 1.000000 | 0.660467 | -0.316815 | -0.230067 |
| Body Mass (g) | 0.009671 | 0.433359 | -0.195070 | 0.660467 | 1.000000 | -0.372535 | -0.254730 |
| Delta 15 N (o/oo) | 0.009710 | -0.064553 | 0.424881 | -0.316815 | -0.372535 | 1.000000 | 0.362955 |
| Delta 13 C (o/oo) | -0.254544 | 0.097830 | 0.293145 | -0.230067 | -0.254730 | 0.362955 | 1.000000 |